In [None]:
import pytz
import datetime
import marimo as mo

# Stamp the notebook with the current time in the Asia/Kolkata zone so the
# "Last Updated" line is shown alongside the IST submission deadline below.
india_timezone = pytz.timezone("Asia/Kolkata")
now = datetime.datetime.now(tz=india_timezone)
curr = format(now, "%Y-%m-%d, %I:%M:%S %p %Z")

# Header markdown; `curr` is interpolated into the "Last Updated" field.
_header = rf"""
# Week - 9

**Submission Date:** `2025-11-30, 23:59 IST`

**Last Updated:** `{curr}`
"""

mo.md(_header)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import accuracy_score
from scipy.stats import mode

In [None]:
# Build the dataset background text first, then render it as a callout box.
_dataset_info = mo.md(r"""
## Data Set Information:

Live selling is becoming increasingly popular in Asian countries. Small vendors can now reach a wider audience and connect with many customers. Analyze The variability of consumer engagement on Facebook Live data which can help sellers to build selling approach and activities for the company.
""")
_dataset_info.callout()

# Assignment Information

**(Consider this statement for Q1 - Q8.) Load the dataset using the following link:**

Url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00488/Live_20210128.csv"

Check if any feature contains **Null** values.

Drop all the features which have **Null** values.

Save `'status_type'` as target variable.

Drop the features `"status_id"`, `"status_type"` and `"status_published"` from the training set.

Use LabelEncoder to transform the target variable.

Use standard scaler to scale the features.

**For Q4, Q5:** Train the model using KMeans clustering (take `random_state=10`).

**For Q6, Q7, Q8:** Train the model using Agglomerative Clustering with the following parameters: `n_clusters=4`, `metric="euclidean"`, `linkage="ward"`.

In [None]:
# Read the Live_20210128 CSV directly from the UCI repository URL given in
# the assignment statement.
_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00488/Live_20210128.csv"
df = pd.read_csv(_data_url)

In [None]:
# Separate the target column from the features. `drop` returns a new frame,
# so later in-place edits to X leave `df` untouched.
y = df["status_type"]
X = df.drop(columns="status_type")

## Question 1

What is the shape of the data?

In [None]:
# Shape of the raw frame as loaded, before any columns are dropped.
df.shape

## Question 2

How many features contain Null values?

In [None]:
# Flag each feature column that holds at least one null, then count the flags.
X.isnull().any().sum()

## Question 3

Column '`status_type`' has _ _ _ _ (number) unique values.

In [None]:
# Count distinct values of the target column. dropna=False keeps NaN as its
# own value, matching what `.unique()` would report.
df["status_type"].nunique(dropna=False)

## Question 4 - 8

### Preprocessing Steps

In [None]:
# Collect every column that has at least one missing value and remove them
# from X in a single in-place drop.
_null_cols = X.columns[X.isnull().any()]
X.drop(columns=_null_cols, inplace=True)

In [None]:
# Remove the remaining columns the assignment says to exclude from the
# training set ("status_type" was already split off as the target).
_excluded_cols = ["status_id", "status_published"]
X.drop(columns=_excluded_cols, inplace=True)

In [None]:
# Integer-encode any object-typed feature columns still present in X,
# using a fresh encoder per column.
_object_cols = X.select_dtypes(include="object").columns
for _col in _object_cols:
    le = LabelEncoder()
    X[_col] = le.fit_transform(X[_col])

# Encode the target with its own encoder to obtain integer class labels.
_target_encoder = LabelEncoder()
y_true = _target_encoder.fit_transform(y)

In [None]:
# Standardise the features. The scaler is configured to return a DataFrame
# from transform rather than a bare array.
ss = StandardScaler()
ss.set_output(transform="pandas")
X_trans = ss.fit_transform(X)

### Question 4 - 5

For **Q4**, **Q5**: Train the model using KMeans clustering (take **random_state=10**).

### Question 4

Enter the inertia score at **k=2**

In [None]:
# Fit a two-cluster KMeans on the scaled features with the seed fixed by the
# assignment, then report the inertia of the fitted model.
_kmeans = KMeans(n_clusters=2, n_init=10, random_state=10).fit(X_trans)

_kmeans.inertia_

### Question 5

How many labels were predicted accurately at k=4?

In [None]:
# Fit a four-cluster KMeans and read back the cluster id assigned to each row.
_kmeans = KMeans(n_clusters=4, n_init=10, random_state=10)
_kmeans.fit(X_trans)
_labels = _kmeans.labels_

# Count rows whose cluster id equals the encoded class id.
# NOTE(review): cluster ids are not aligned to the LabelEncoder classes, so
# this is a raw id-match count rather than a permutation-aligned accuracy.
# Confirm that this is what the question expects.
_matches = _labels == y_true
_matches.sum()

## Question 6 - 8

In [None]:
# Ward-linkage agglomerative clustering into four clusters using Euclidean
# distance.
aggo = AgglomerativeClustering(
    linkage="ward",
    metric="euclidean",
    n_clusters=4,
)
# Fit on the scaled features only; clustering is unsupervised and the
# estimator ignores any target passed to fit.
aggo.fit(X_trans)

### Question 6

What is the label predicted for the first row of samples?

In [None]:
# Cluster label the fitted agglomerative model assigned to the first sample.
aggo.labels_[0]

### Question 7

Enter the number of leaves in the hierarchical tree.

In [None]:
# Number of leaves in the hierarchical tree built by the fitted model.
aggo.n_leaves_

### Question 8

What is the accuracy of the model (in percentage)?

In [None]:
# Fraction of rows whose cluster id equals the encoded class id, shown as a
# percentage.
# NOTE(review): cluster ids are compared to encoded classes without any
# alignment step. Confirm that this is the intended metric.
_accuracy = accuracy_score(y_true, aggo.labels_)
_accuracy * 100